STATS 506 Problem Set #5

Author

Haiming Li

OOP Programming

  1. Here’s the inclusion of gcd and lcm method using Rcpp
library(Rcpp)
cppFunction('
#include <numeric>
int gcd(int a, int b) {
  return std::gcd(a, b);
}')

cppFunction('
#include <numeric>
int lcm(int a, int b) {
  return std::lcm(a, b);
}
')

Here’s the class definition and class methods.

setClass('rational',
         slots = list(numerator = 'integer', 
                      denominator = 'integer'))

##' Constructor
##' @param numerator An integer value
##' @param denominator An none zero integer value
##' @return
##' @export
rational <- function(numerator = 0, denominator = 1) {
  if (!is.numeric(numerator) | !is.numeric(denominator)) {
    stop('both numerator and denominator must be integer')
  }
  if (numerator != as.integer(numerator) | 
      denominator !=as.integer(denominator)) {
    stop('both numerator and denominator cannot be floating point')
  }
  return(new('rational', numerator = as.integer(numerator), 
      denominator = as.integer(denominator)))
}

setValidity('rational', function(object){
  if (object@denominator == 0L) {
    stop('denominator cannot be 0')
  }
  return(TRUE)
})
Class "rational" [in ".GlobalEnv"]

Slots:
                              
Name:    numerator denominator
Class:     integer     integer
setMethod('show', 'rational',
          function(object) {
            cat(object@numerator, '/', object@denominator, '\n')
            return(invisible(object))
          })

setGeneric('simplify', function(object) standardGeneric('simplify'))
[1] "simplify"
setMethod('simplify', 'rational', function(object) {
  # compute the unsigned greatest common divisor
  divisor <- gcd(abs(object@numerator), abs(object@denominator))
  # simplify the fraction and sign
  new_num <- object@numerator / divisor
  new_den <- object@denominator / divisor
  if (new_num < 0 & new_den < 0) {
    new_num <- abs(new_num)
    new_den <- abs(new_den)
  }
  return(rational(new_num, new_den))
})

setGeneric('quotient', function(object, digits = 1) standardGeneric('quotient'))
[1] "quotient"
setMethod('quotient', 'rational', function(object, digits = 1) {
  # round() will floor the digits if it's not integer
  if (!is.numeric(digits) | digits < 0) {
    stop('digit must be none negative real number')
  }
  quotient_value <- object@numerator / object@denominator
  print(round(quotient_value, digits))
  return(invisible(quotient_value))
})

setMethod('+', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  denom <- lcm(e1@denominator, e2@denominator)
  numer <- (e1@numerator * (denom / e1@denominator)) + 
    (e2@numerator * (denom / e2@denominator))
  return(simplify(rational(numer, denom)))
})

setMethod('-', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  denom <- lcm(e1@denominator, e2@denominator)
  numer <- (e1@numerator * (denom / e1@denominator)) - 
    (e2@numerator * (denom / e2@denominator))
  return(simplify(rational(numer, denom)))
})

setMethod('*', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  numer <- e1@numerator * e2@numerator
  denom <- e1@denominator * e2@denominator
  return(simplify(rational(numer, denom)))
})

setMethod('/', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  numer <- e1@numerator * e2@denominator
  denom <- e1@denominator * e2@numerator
  if (denom == 0L) stop('division by zero')
  return(simplify(rational(numer, denom)))
})
  1. Here’s the demonstration
r1 <- rational(24, 6)
r2 <- rational(7, 230)
r3 <- rational(0, 4)
r1
24 / 6 
r3
0 / 4 
r1 + r2
927 / 230 
r1 - r2
913 / 230 
r1 * r2
14 / 115 
r1 / r2
920 / 7 
r1 + r3
4 / 1 
r1 * r3
0 / 1 
r2 / r3
Error in r2/r3: division by zero
quotient(r1)
[1] 4
quotient(r2)
[1] 0
quotient(r2, digits = 3)
[1] 0.03
quotient(r2, digits = 3.14)
[1] 0.03
quotient(r2, digits = 'avocado')
Error in quotient(r2, digits = "avocado"): digit must be none negative real number
q2 <- quotient(r2, digits = 3)
[1] 0.03
q2
[1] 0.03043478
quotient(r3)
[1] 0
simplify(r1)
4 / 1 
simplify(r2)
7 / 230 
simplify(r3)
0 / 1 
  1. Here’s the checking of malformed input to constructor
rational(1, 0)
Error in validityMethod(object): denominator cannot be 0
rational(3.14, 1)
Error in rational(3.14, 1): both numerator and denominator cannot be floating point
rational(1, 3.14)
Error in rational(1, 3.14): both numerator and denominator cannot be floating point
rational('A', 1)
Error in rational("A", 1): both numerator and denominator must be integer
rational(1, 'A')
Error in rational(1, "A"): both numerator and denominator must be integer

Plotly

  1. Here’s the recreation using plotly
library(tibble)
library(dplyr)
library(reshape2)
library(plotly)

# read & proces data (same as Pset4)
df <- as_tibble(read.csv('./df_for_ml_improved_new_market.csv'))
genre_columns <- grep('Genre___', colnames(df), value = TRUE)
genre_data <- df[, c('id', 'year', 'price_usd', genre_columns)] %>%
  melt(id.vars = c('id', 'year', 'price_usd')) %>%
  filter(value == 1) %>%
  mutate(genre=sub('Genre___', '', variable)) %>%
  select(id, year, price_usd, genre)

# Ex: If the genre is both 'Other' and 'Painting', the final genre should be
#     'Painting'
genre_priority <- c('Photography', 'Print', 'Sculpture', 'Painting', 'Others')
genre_data$genre <- factor(genre_data$genre, levels = genre_priority)
df <- genre_data[!duplicated(genre_data$id), 2:4]
genre_data <- df %>%
  count(year, genre) %>%
  group_by(year) %>%
  mutate(percent = n / sum(n))

plot_ly(data = genre_data, x = ~percent, y = ~year, color = ~genre,
        type = 'bar', orientation = 'h') %>%
  layout(
    barmode = 'stack',
    title = 'Genre Distribution Over Years',
    xaxis = list(title = 'Percentage'),
    yaxis = list(
      title = 'Year',
      tickmode = 'array',
      tickvals = unique(genre_data$year),
      ticktext = unique(genre_data$year),
      categoryorder = 'trace'
    ),
    legend = list(title = list(text = 'Genre'))
  )
  1. Here’s the interactive plot
plot_data <- df %>%
  group_by(year, genre) %>%
  summarize(avg_price_usd = mean(price_usd, na.rm = TRUE)) %>%
  ungroup()

traces <- list()

# overall trend
overall_trace <- plot_data %>%
  group_by(year) %>%
  summarize(avg_price_usd = mean(avg_price_usd, na.rm = TRUE))

traces[['Overall']] <- list(
  x = overall_trace$year,
  y = overall_trace$avg_price_usd,
  type = 'scatter',
  mode = 'lines+markers',
  name = 'Overall'
)

# genre-specific trend
genres <- unique(plot_data$genre)
for (genre in genres) {
  genre_trace <- plot_data %>% filter(genre == !!genre)
  
  traces[[genre]] <- list(
    x = genre_trace$year,
    y = genre_trace$avg_price_usd,
    type = 'scatter',
    mode = 'lines+markers',
    name = genre
  )
}

fig <- plot_ly()
for (trace_name in names(traces)) {
  fig <- fig %>%
    add_trace(
      x = traces[[trace_name]]$x,
      y = traces[[trace_name]]$y,
      type = traces[[trace_name]]$type,
      mode = traces[[trace_name]]$mode,
      name = trace_name,
      visible = ifelse(trace_name == 'Overall', TRUE, FALSE)
    )
}

# drop down menu
fig <- fig %>%
  layout(
    title = 'Change in Sales Price Over Time by Genre',
    xaxis = list(title = 'Year'),
    yaxis = list(title = 'Average Price (USD)'),
    updatemenus = list(list(
        buttons = lapply(names(traces), function(genre) {
          list(
            method = 'update',
            # only show selected button serie
            args = list(list(visible = sapply(names(traces), function(x) x == genre))),
            label = genre
          )}),
        direction = 'down',
        x = 0.08,
        y = 1.1
      )))
fig

data.table

# prepare data
library(nycflights13)
library(data.table)
data(flights); data(airports)
setDT(flights)
setDT(airports)
  1. first table
# summary statistics
table1 <- flights[, .(
  mean_delay = mean(dep_delay, na.rm = TRUE),
  median_delay = median(dep_delay, na.rm = TRUE),
  n_flights = .N
), by = origin][n_flights >= 10]
# join with origin for no match
table1 <- merge(table1[, faa := origin], airports,
                by = "faa", all.x = TRUE)
table1[, .(name, mean_delay, median_delay)][order(-mean_delay)]
                  name mean_delay median_delay
                <char>      <num>        <num>
1: Newark Liberty Intl   15.10795           -1
2: John F Kennedy Intl   12.11216           -1
3:          La Guardia   10.34688           -3

second table

# summary statistics
table2 <- flights[, .(
  mean_delay = mean(arr_delay, na.rm = TRUE),
  median_delay = median(arr_delay, na.rm = TRUE),
  n_flights = .N
), by = dest][n_flights >= 10]
table2 <- merge(table2[, faa := dest], airports,
                by = "faa", all.x = TRUE)
table2 <- table2[, name := ifelse(is.na(name), dest, name)]
table2 <- table2[, .(name, mean_delay, median_delay)][order(-mean_delay)]
print(table2, nrows = 102)
                                     name   mean_delay median_delay
                                   <char>        <num>        <num>
  1:                Columbia Metropolitan  41.76415094         28.0
  2:                           Tulsa Intl  33.65986395         14.0
  3:                    Will Rogers World  30.61904762         16.0
  4:                 Jackson Hole Airport  28.09523810         15.0
  5:                        Mc Ghee Tyson  24.06920415          2.0
  6:               Dane Co Rgnl Truax Fld  20.19604317          1.0
  7:                        Richmond Intl  20.11125320          1.0
  8:        Akron Canton Regional Airport  19.69833729          3.0
  9:                      Des Moines Intl  19.00573614          0.0
 10:                   Gerald R Ford Intl  18.18956044          1.0
 11:                      Birmingham Intl  16.87732342         -2.0
 12:         Theodore Francis Green State  16.23463687          1.0
 13: Greenville-Spartanburg International  15.93544304         -0.5
 14:    Cincinnati Northern Kentucky Intl  15.36456376         -3.0
 15:            Savannah Hilton Head Intl  15.12950601         -1.0
 16:          Manchester Regional Airport  14.78755365         -3.0
 17:                          Eppley Afld  14.69889841         -2.0
 18:                               Yeager  14.67164179         -1.5
 19:                     Kansas City Intl  14.51405836          0.0
 20:                          Albany Intl  14.39712919         -4.0
 21:                General Mitchell Intl  14.16722038          0.0
 22:                       Piedmont Triad  14.11260054         -2.0
 23:               Washington Dulles Intl  13.86420212         -3.0
 24:               Cherry Capital Airport  12.96842105        -10.0
 25:              James M Cox Dayton Intl  12.68048606         -3.0
 26:     Louisville International Airport  12.66938406         -2.0
 27:                  Chicago Midway Intl  12.36422360         -1.0
 28:                      Sacramento Intl  12.10992908          4.0
 29:                    Jacksonville Intl  11.84483416         -2.0
 30:                       Nashville Intl  11.81245891         -2.0
 31:                Portland Intl Jetport  11.66040210         -4.0
 32:               Greater Rochester Intl  11.56064461         -5.0
 33:      Hartsfield Jackson Atlanta Intl  11.30011285         -1.0
 34:                Lambert St Louis Intl  11.07846451         -3.0
 35:                         Norfolk Intl  10.94909344         -4.0
 36:            Baltimore Washington Intl  10.72673385         -5.0
 37:                         Memphis Intl  10.64531435         -2.5
 38:                   Port Columbus Intl  10.60132291         -3.0
 39:                  Charleston Afb Intl  10.59296847         -4.0
 40:                    Philadelphia Intl  10.12719014         -3.0
 41:                  Raleigh Durham Intl  10.05238095         -3.0
 42:                    Indianapolis Intl   9.94043412         -3.0
 43:            Charlottesville-Albemarle   9.50000000         -5.0
 44:               Cleveland Hopkins Intl   9.18161129         -5.0
 45:        Ronald Reagan Washington Natl   9.06695204         -2.0
 46:                      Burlington Intl   8.95099602         -4.0
 47:                 Buffalo Niagara Intl   8.94595186         -5.0
 48:                Syracuse Hancock Intl   8.90392501         -5.0
 49:                          Denver Intl   8.60650021         -2.0
 50:                      Palm Beach Intl   8.56297210         -3.0
 51:                                  BQN   8.24549550         -1.0
 52:                             Bob Hope   8.17567568         -3.0
 53:       Fort Lauderdale Hollywood Intl   8.08212154         -3.0
 54:                          Bangor Intl   8.02793296         -9.0
 55:           Asheville Regional Airport   8.00383142         -1.0
 56:                                  PSE   7.87150838          0.0
 57:                      Pittsburgh Intl   7.68099053         -5.0
 58:                       Gallatin Field   7.60000000         -2.0
 59:                 NW Arkansas Regional   7.46572581         -2.0
 60:                           Tampa Intl   7.40852503         -4.0
 61:               Charlotte Douglas Intl   7.36031885         -3.0
 62:             Minneapolis St Paul Intl   7.27016886         -5.0
 63:                      William P Hobby   7.17618819         -4.0
 64:                         Bradley Intl   7.04854369        -10.0
 65:                     San Antonio Intl   6.94537178         -9.0
 66:                      South Bend Rgnl   6.50000000         -3.5
 67:     Louis Armstrong New Orleans Intl   6.49017497         -6.0
 68:                        Key West Intl   6.35294118          7.0
 69:                        Eagle Co Rgnl   6.30434783         -4.0
 70:                Austin Bergstrom Intl   6.01990875         -5.0
 71:                   Chicago Ohare Intl   5.87661475         -8.0
 72:                         Orlando Intl   5.45464309         -5.0
 73:               Detroit Metro Wayne Co   5.42996346         -7.0
 74:                        Portland Intl   5.14157973         -5.0
 75:                        Nantucket Mem   4.85227273         -3.0
 76:                      Wilmington Intl   4.63551402         -7.0
 77:                    Myrtle Beach Intl   4.60344828        -13.0
 78:    Albuquerque International Sunport   4.38188976         -5.5
 79:         George Bush Intercontinental   4.24079040         -5.0
 80:        Norman Y Mineta San Jose Intl   3.44817073         -7.0
 81:               Southwest Florida Intl   3.23814963         -5.0
 82:                       San Diego Intl   3.13916574         -5.0
 83:              Sarasota Bradenton Intl   3.08243131         -5.0
 84:            Metropolitan Oakland Intl   3.07766990         -9.0
 85:   General Edward Lawrence Logan Intl   2.91439222         -9.0
 86:                   San Francisco Intl   2.67289152         -8.0
 87:                                  SJU   2.52052659         -6.0
 88:                         Yampa Valley   2.14285714          2.0
 89:              Phoenix Sky Harbor Intl   2.09704733         -6.0
 90:            Montrose Regional Airport   1.78571429        -10.5
 91:                     Los Angeles Intl   0.54711094         -7.0
 92:               Dallas Fort Worth Intl   0.32212685         -9.0
 93:                           Miami Intl   0.29905978         -9.0
 94:                       Mc Carran Intl   0.25772849         -8.0
 95:                  Salt Lake City Intl   0.17625459         -8.0
 96:                           Long Beach  -0.06202723        -10.0
 97:                Martha\\\\'s Vineyard  -0.28571429        -11.0
 98:                  Seattle Tacoma Intl  -1.09909910        -11.0
 99:                        Honolulu Intl  -1.36519258         -7.0
100:                                  STT  -3.83590734         -9.0
101:            John Wayne Arpt Orange Co  -7.86822660        -11.0
102:                    Palm Springs Intl -12.72222222        -13.5
                                     name   mean_delay median_delay
  1. Here’s the table
data(planes)
setDT(planes)

table3 <- merge(flights, planes,
                by = "tailnum", all.x = TRUE)
table3 <- table3[, mph := 60 * distance / air_time]
table3 <- table3[, .(
  avg_mph = mean(mph, na.rm = TRUE),
  n_flights = .N
), by = model]
table3[order(-avg_mph)][1]
     model  avg_mph n_flights
    <char>    <num>     <int>
1: 777-222 482.6254         4